/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2018 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* This program is free software: you can redistribute it and/or modify                            *
* it under the terms of the GNU General Public License as published by                            *
* the Free Software Foundation, either version 3 of the License, or                               *
* (at your option) any later version.                                                             *
*                                                                                                 *
* This program is distributed in the hope that it will be useful,                                 *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                   *
* GNU General Public License for more details.                                                    *
*                                                                                                 *
* You should have received a copy of the GNU General Public License                               *
* along with this program.  If not, see <https://www.gnu.org/licenses/>.                          *
*                                                                                                 *
* The authors designate this particular file as subject to the "Classpath" exception              *
* as provided by the authors in the LICENSE file that accompanied this code.                      *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/



// Calling-convention glue. Depending on the target OS this section defines
// where the arguments ARG1..ARG18 live and the PROLOGUE/EPILOGUE sequences
// that save and restore registers around a kernel body.
#if defined(OS_LINUX) | defined(OS_MAC)

// Linux / macOS: ARG1-ARG6 are registers; ARG7 and up are stack slots addressed
// relative to rsp after PROLOGUE has lowered it by STACKSIZE.
//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
// PROLOGUE: reserve STACKSIZE bytes, store rbx, rbp and r12-r15 at offsets
// 0..40 of the new frame, then execute vzeroupper.
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
// EPILOGUE: vzeroupper, reload the six registers saved by PROLOGUE and release
// the frame. It does not emit the ret instruction.
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

// Windows: ARG1-ARG4 are registers; ARG5 and up are stack slots addressed
// relative to rsp after PROLOGUE has lowered it by STACKSIZE.
// NOTE(review): the +40 base offset presumably covers the return address plus
// the 32-byte shadow space of the Microsoft x64 convention - confirm.
#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
// PROLOGUE: reserve STACKSIZE bytes, store rbx, rbp, r12-r15, rdi and rsi at
// offsets 0..56 and xmm6-xmm15 (16 bytes each, unaligned stores) at offsets
// 64..208, then execute vzeroupper.
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
// EPILOGUE: vzeroupper, reload everything saved by PROLOGUE from the same
// offsets and release the frame. It does not emit the ret instruction.
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#else

// Any other target is rejected at preprocessing time.
#error wrong OS

#endif





// Switch to the code section, using the directive spelling of the target OS.
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif





// --- Inner kernels
// naming: _8_<rows-offset-A>_[GEN]_lib8

// All 8 rows

// src matrix aligned

// void INNER_KERNEL_SGECPSC_8_0_LIB8
//
// Scaled copy of k columns of 8 floats: B[j] = alpha * A[j] for j = 0..k-1.
// Each column is one 32-byte vector and consecutive columns are 32 bytes
// apart. A and B are accessed with vmovaps, so both must be 32-byte aligned.
// Emitted either as an assembler macro (MACRO_LEVEL>=1) or as a local
// subroutine ending in ret.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13    <- B
//
// on exit (k > 0):
// r10d   <- 0
// r12    <- A + 32*k bytes
// r13    <- B + 32*k bytes
// clobbers: ymm0, ymm15, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_0_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_0_lib8, @function
inner_kernel_sgecpsc_8_0_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_0_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_0_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_0_lib8:
#endif
#endif

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r12), %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r13)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r13)

	addq		$128, %r12
	addq		$128, %r13

	// column 3 (pointers already advanced, hence the -32 offset)
	vmovaps		-32(%r12), %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r13)

	// Loop again while at least 4 columns remain. The threshold matches the
	// entry guard above; comparing against 4 here would push the last 4
	// columns of every multiple-of-4 run through the one-column clean-up loop.
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r12), %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret
#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_0_lib8, .-inner_kernel_sgecpsc_8_0_lib8
#endif
#endif
//
// end static subroutine





// void INNER_KERNEL_SGECP_8_0_LIB8
//
// Plain copy of k columns of 8 floats: B[j] = A[j] for j = 0..k-1.
// Each column is one 32-byte vector and consecutive columns are 32 bytes
// apart. A and B are accessed with vmovaps, so both must be 32-byte aligned.
// Emitted either as an assembler macro (MACRO_LEVEL>=1) or as a local
// subroutine ending in ret.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12    <- B
//
// on exit (k > 0):
// r10d   <- 0
// r11    <- A + 32*k bytes
// r12    <- B + 32*k bytes
// clobbers: ymm0, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_0_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_0_lib8, @function
inner_kernel_sgecp_8_0_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_0_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_0_lib8:
#endif
#endif

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		%ymm0, 0(%r12)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		%ymm0, 32(%r12)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		%ymm0, 64(%r12)

	addq		$128, %r11
	addq		$128, %r12

	// column 3 (pointers already advanced, hence the -32 offset)
	vmovaps		-32(%r11), %ymm0
	vmovaps		%ymm0, -32(%r12)

	// Loop again while at least 4 columns remain. The threshold matches the
	// entry guard above; comparing against 4 here would push the last 4
	// columns of every multiple-of-4 run through the one-column clean-up loop.
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r11), %ymm0
	vmovaps		%ymm0, 0(%r12)
	subl		$1, %r10d

	addq		$32, %r11
	addq		$32, %r12

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
#endif

#endif
//
// end





// src matrix not aligned

// void INNER_KERNEL_SGECPSC_8_1_LIB8
//
// Scaled copy with a 1-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r12) and the one at A1 = A0 + r13, assembles
// [ A0[1..7], A1[0] ], multiplies it by alpha and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13d   <- sda*8*sizeof(float) (added to A as the full 64-bit r13)
// r14    <- B
//
// clobbers: rax, r10d, r12, r14, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_1_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_1_lib8, @function
inner_kernel_sgecpsc_8_1_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_1_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_1_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_1_lib8:
#endif
#endif

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	// lane 0 <- A1[0], lanes 1..7 keep A0[1..7]
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half left by one element
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// take lanes 3 and 7 from the swapped copy: [ A0[1..7], A1[0] ]
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$4, %r10d

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	addq		$128, %r12
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	addq		$128, %r14

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r14)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_1_lib8, .-inner_kernel_sgecpsc_8_1_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_1_LIB8
//
// Plain copy with a 1-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r11) and the one at A1 = A0 + r12, assembles
// [ A0[1..7], A1[0] ] and stores it to B.
// The active code path uses vmovaps for all loads and stores (32-byte aligned
// addresses required); consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12d   <- 8*sda*sizeof(float) (added to A as the full 64-bit r12)
// r13    <- B
//
// clobbers: rax, r10d, r11, r13, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_1_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_1_lib8, @function
inner_kernel_sgecp_8_1_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_1_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_1_lib8:
#endif
#endif

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// active variant: aligned loads followed by an in-register rotation
#if 1
	// load A0
	vmovaps		0(%r11), %ymm0
	// load A1
	vmovaps		0(%rax), %ymm1

	// magic: rotate the 8 lanes down by one, pulling A1[0] into lane 7
	// lane 0 <- A1[0], lanes 1..7 keep A0[1..7]
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half left by one element
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// take lanes 3 and 7 from the swapped copy: [ A0[1..7], A1[0] ]
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	// copy in B
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0

	vmovaps		%ymm0, 32(%r13)

	addq		$128, %r11
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1

	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)
	addq		$128, %r13

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, -32(%r13)
#else
	// disabled variant: unaligned loads starting 4 bytes into the A0 column
	// and 28 bytes before the A1 column, then lane 7 is taken from the A1 load
	vmovups		4(%r11), %ymm0
	vmovups		-28(%rax), %ymm1
	vblendps	$0x80, %ymm1, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	vmovups		36(%r11), %ymm0
	vmovups		4(%rax), %ymm1
	vblendps	$0x80, %ymm1, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r13)
	addq		$128, %r11
	addq		$128, %rax

	vmovups		-60(%r11), %ymm0
	vmovups		-92(%rax), %ymm1
	vblendps	$0x80, %ymm1, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r13)
	addq		$128, %r13

	vmovups		-28(%r11), %ymm0
	vmovups		-60(%rax), %ymm1
	vblendps	$0x80, %ymm1, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r13)
#endif

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_2_LIB8
//
// Scaled copy with a 2-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r12) and the one at A1 = A0 + r13, assembles
// [ A0[2..7], A1[0..1] ], multiplies it by alpha and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13d   <- sda*8*sizeof(float) (added to A as the full 64-bit r13)
// r14    <- B
//
// clobbers: rax, r10d, r12, r14, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_2_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_2_lib8, @function
inner_kernel_sgecpsc_8_2_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_2_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_2_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_2_lib8:
#endif
#endif

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	// lanes 0..1 <- A1[0..1], lanes 2..7 keep A0[2..7]
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	// swap the two 64-bit pairs inside each 128-bit half
	vpermilps	$0x4e, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// take lanes 2..3 and 6..7 from the swapped copy: [ A0[2..7], A1[0..1] ]
	vblendps	$0x33, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$4, %r10d

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1

	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	addq		$128, %r12
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1

	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	addq		$128, %r14

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1

	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r14)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_2_lib8, .-inner_kernel_sgecpsc_8_2_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_2_LIB8
//
// Plain copy with a 2-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r11) and the one at A1 = A0 + r12, assembles
// [ A0[2..7], A1[0..1] ] and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12d   <- 8*sda*sizeof(float) (added to A as the full 64-bit r12)
// r13    <- B
//
// clobbers: rax, r10d, r11, r13, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_2_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_2_lib8, @function
inner_kernel_sgecp_8_2_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_2_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_2_lib8:
#endif
#endif

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// lanes 0..1 <- A1[0..1], lanes 2..7 keep A0[2..7]
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	// swap the two 64-bit pairs inside each 128-bit half
	vpermilps	$0x4e, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// take lanes 2..3 and 6..7 from the swapped copy: [ A0[2..7], A1[0..1] ]
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 32(%r13)
	addq		$128, %r11
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)
	addq		$128, %r13

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, -32(%r13)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_3_LIB8
//
// Scaled copy with a 3-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r12) and the one at A1 = A0 + r13, assembles
// [ A0[3..7], A1[0..2] ], multiplies it by alpha and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13d   <- sda*8*sizeof(float) (added to A as the full 64-bit r13)
// r14    <- B
//
// clobbers: rax, r10d, r12, r14, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_3_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_3_lib8, @function
inner_kernel_sgecpsc_8_3_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_3_lib8:
#elif defined(OS_WINDOWS)
	// NOTE(review): storage class 3 here, while the 8_0, 8_1 and 8_2 kernels
	// of this file use .scl 2 - confirm which one is intended
	.def inner_kernel_sgecpsc_8_3_lib8; .scl 3; .type 32; .endef
inner_kernel_sgecpsc_8_3_lib8:
#endif
#endif

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	// lanes 0..2 <- A1[0..2], lanes 3..7 keep A0[3..7]
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half right by one element
	vpermilps	$0x93, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// keep lanes 0 and 4, take the rest from the swapped copy:
	// [ A0[3..7], A1[0..2] ]
	vblendps	$0x11, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$4, %r10d

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1

	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	addq		$128, %r12
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1

	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	addq		$128, %r14

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1

	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r14)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_3_lib8, .-inner_kernel_sgecpsc_8_3_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_3_LIB8
//
// Plain copy with a 3-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r11) and the one at A1 = A0 + r12, assembles
// [ A0[3..7], A1[0..2] ] and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12d   <- 8*sda*sizeof(float) (added to A as the full 64-bit r12)
// r13    <- B
//
// clobbers: rax, r10d, r11, r13, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_3_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_3_lib8, @function
inner_kernel_sgecp_8_3_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_3_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_3_lib8:
#endif
#endif

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// lanes 0..2 <- A1[0..2], lanes 3..7 keep A0[3..7]
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half right by one element
	vpermilps	$0x93, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// keep lanes 0 and 4, take the rest from the swapped copy:
	// [ A0[3..7], A1[0..2] ]
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 32(%r13)
	addq		$128, %r11
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)
	addq		$128, %r13

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, -32(%r13)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_4_LIB8
//
// Scaled copy with a 4-row source offset. For each of the k columns it loads
// the upper 4 floats of the column at A0 (r12) and the lower 4 floats of the
// column at A1 = A0 + r13, joins them as [ A0[4..7], A1[0..3] ], multiplies
// by alpha and stores the result to B.
// Loads are 16-byte vmovaps and stores are 32-byte vmovaps, so the addresses
// must be aligned accordingly; consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13d   <- sda*8*sizeof(float) (added to A as the full 64-bit r13)
// r14    <- B
//
// clobbers: rax, r10d, r12, r14, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_4_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_4_lib8, @function
inner_kernel_sgecpsc_8_4_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_4_lib8:
#elif defined(OS_WINDOWS)
	// NOTE(review): storage class 3 here, while the 8_0, 8_1 and 8_2 kernels
	// of this file use .scl 2 - confirm which one is intended
	.def inner_kernel_sgecpsc_8_4_lib8; .scl 3; .type 32; .endef
inner_kernel_sgecpsc_8_4_lib8:
#endif
#endif

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0: low half <- A0[4..7], high half <- A1[0..3]
	vmovaps		16(%r12), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$4, %r10d

	// column 1
	vmovaps		48(%r12), %xmm0
	vmovaps		32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	addq		$128, %r12
	addq		$128, %rax

	// column 2 (source pointers already advanced: -48 is +16 within the
	// column, -64 is its start)
	vmovaps		-48(%r12), %xmm0
	vmovaps	-64(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	addq		$128, %r14

	// column 3 (all pointers already advanced)
	vmovaps		-16(%r12), %xmm0
	vmovaps		-32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r14)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		16(%r12), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_4_lib8, .-inner_kernel_sgecpsc_8_4_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_4_LIB8
// subroutine
//
// Plain copy with a 4-row source offset. For each of the k columns it loads
// the upper 4 floats of the column at A0 (r11) and the lower 4 floats of the
// column at A1 = A0 + r12, joins them as [ A0[4..7], A1[0..3] ] and stores
// the result to B.
// Loads are 16-byte vmovaps and stores are 32-byte vmovaps, so the addresses
// must be aligned accordingly; consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12d   <- 8*sda*sizeof(float) (added to A as the full 64-bit r12)
// r13    <- B
//
// clobbers: rax, r10d, r11, r13, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_4_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_4_lib8, @function
inner_kernel_sgecp_8_4_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_4_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_4_lib8:
#endif
#endif

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	// The three pointers are advanced at different points of the iteration,
	// so the displacements below are relative to whichever value is current.
	.p2align 3
1: // main loop

	// column 0: low half <- A0[4..7], high half <- A1[0..3]
	vmovaps		16(%r11), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r13)

	subl		$4, %r10d

	// column 1
	vmovaps		48(%r11), %xmm0
	vmovaps		32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r13)

	addq		$128, %r11

	// column 2: r11 already advanced (-48 is +16 within the column), rax and
	// r13 not yet advanced
	vmovaps		-48(%r11), %xmm0
	vmovaps		64(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r13)

	addq		$128, %rax

	// column 3: r11 and rax already advanced, r13 not yet advanced
	vmovaps		-16(%r11), %xmm0
	vmovaps		-32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmovaps		%ymm0, 96(%r13)

	addq		$128, %r13

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		16(%r11), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_5_LIB8
//
// Scaled copy with a 5-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r12) and the one at A1 = A0 + r13, assembles
// [ A0[5..7], A1[0..4] ], multiplies it by alpha and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer to the float scale factor)
// r12    <- A
// r13d   <- sda*8*sizeof(float) (added to A as the full 64-bit r13)
// r14    <- B
//
// clobbers: rax, r10d, r12, r14, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_5_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_5_lib8, @function
inner_kernel_sgecpsc_8_5_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_5_lib8:
#elif defined(OS_WINDOWS)
	// NOTE(review): storage class 3 here, while the 8_0, 8_1 and 8_2 kernels
	// of this file use .scl 2 - confirm which one is intended
	.def inner_kernel_sgecpsc_8_5_lib8; .scl 3; .type 32; .endef
inner_kernel_sgecpsc_8_5_lib8:
#endif
#endif

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm15 <- alpha replicated in all 8 lanes
	vbroadcastss	0(%r11), %ymm15

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	// lanes 0..4 <- A1[0..4], lanes 5..7 keep A0[5..7]
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half left by one element
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// keep lanes 3 and 7, take the rest from the swapped copy:
	// [ A0[5..7], A1[0..4] ]
	vblendps	$0x88, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$4, %r10d

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1

	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	addq		$128, %r12
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1

	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	addq		$128, %r14

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1

	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, -32(%r14)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1

	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0

	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 0(%r14)

	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_5_lib8, .-inner_kernel_sgecpsc_8_5_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_5_LIB8
//
// Plain copy with a 5-row source offset. For each of the k columns it loads
// the 8-float vector at A0 (r11) and the one at A1 = A0 + r12, assembles
// [ A0[5..7], A1[0..4] ] and stores it to B.
// All loads and stores are vmovaps (32-byte aligned addresses required);
// consecutive columns are 32 bytes apart.
//
// input arguments:
// r10d   <- k
// r11    <- A
// r12d   <- 8*sda*sizeof(float) (added to A as the full 64-bit r12)
// r13    <- B
//
// clobbers: rax, r10d, r11, r13, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_5_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_5_lib8, @function
inner_kernel_sgecp_8_5_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_5_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_5_lib8:
#endif
#endif

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: skip the unrolled loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (128 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// lanes 0..4 <- A1[0..4], lanes 5..7 keep A0[5..7]
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	// rotate each 128-bit half left by one element
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two halves swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// keep lanes 3 and 7, take the rest from the swapped copy:
	// [ A0[5..7], A1[0..4] ]
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 32(%r13)
	addq		$128, %r11
	addq		$128, %rax

	// column 2 (source pointers already advanced, hence -64)
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)
	addq		$128, %r13

	// column 3 (all pointers already advanced, hence -32)
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, -32(%r13)

	// loop again while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop: one column per iteration

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_6_LIB8
//
// Scaled copy with a 6-row offset in the source. For each of the k columns:
//   B <- alpha * { A0[6], A0[7], A1[0], ..., A1[5] }
// where A0 is the 8-float vector at r12 and A1 is the 8-float vector located
// r13 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (A0, read with aligned 32-byte loads)
// r13    <- sda*8*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r14    <- B      (written with aligned 32-byte stores)
//
// on return: r12, rax and r14 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_6_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_6_lib8, @function
inner_kernel_sgecpsc_8_6_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_6_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_6_lib8; .scl 3; .type 32; .endef
inner_kernel_sgecpsc_8_6_lib8:
#endif
#endif

	leaq	(%r12,%r13), %rax // A1 <- A0 + 8*sda*sizeof(float)

	vbroadcastss	(%r11), %ymm15 // alpha in every lane

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r12), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0 // lanes 0..5 from A1, lanes 6..7 from A0
	vpermilps	$0x4e, %ymm0, %ymm0 // swap the two float pairs inside each 128-bit half
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0 // lanes 2,3,6,7 from ymm0, rest from ymm1
	vmulps		%ymm15, %ymm0, %ymm0 // scale by alpha
	vmovaps		%ymm0, (%r14)

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	// column 3
	vmovaps		96(%r12), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 96(%r14)

	// step all pointers past the 4 columns just handled
	addq		$128, %r12
	addq		$128, %rax
	addq		$128, %r14
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r12), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, (%r14)

	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_6_lib8, .-inner_kernel_sgecpsc_8_6_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_6_LIB8
// subroutine
//
// Copy with a 6-row offset in the source. For each of the k columns:
//   B <- { A0[6], A0[7], A1[0], ..., A1[5] }
// where A0 is the 8-float vector at r11 and A1 is the 8-float vector located
// r12 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (A0, read with aligned 32-byte loads)
// r12    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r13    <- B      (written with aligned 32-byte stores)
//
// on return: r11, rax and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_6_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_6_lib8, @function
inner_kernel_sgecp_8_6_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_6_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_6_lib8:
#endif
#endif

	leaq	(%r11,%r12), %rax // A1 <- A0 + 8*sda*sizeof(float)

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r11), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0 // lanes 0..5 from A1, lanes 6..7 from A0
	vpermilps	$0x4e, %ymm0, %ymm0 // swap the two float pairs inside each 128-bit half
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0 // lanes 2,3,6,7 from ymm0, rest from ymm1
	vmovaps		%ymm0, (%r13)

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 32(%r13)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)

	// column 3
	vmovaps		96(%r11), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 96(%r13)

	// step all pointers past the 4 columns just handled
	addq		$128, %r11
	addq		$128, %rax
	addq		$128, %r13
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r11), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, (%r13)

	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_7_LIB8
//
// Scaled copy with a 7-row offset in the source. For each of the k columns:
//   B <- alpha * { A0[7], A1[0], ..., A1[6] }
// where A0 is the 8-float vector at r12 and A1 is the 8-float vector located
// r13 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (A0, read with aligned 32-byte loads)
// r13    <- sda*8*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r14    <- B      (written with aligned 32-byte stores)
//
// on return: r12, rax and r14 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_7_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_7_lib8, @function
inner_kernel_sgecpsc_8_7_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_7_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_7_lib8; .scl 3; .type 32; .endef
inner_kernel_sgecpsc_8_7_lib8:
#endif
#endif

	leaq	(%r12,%r13), %rax // A1 <- A0 + 8*sda*sizeof(float)

	vbroadcastss	(%r11), %ymm15 // alpha in every lane

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r12), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0 // lanes 0..6 from A1, lane 7 from A0
	vpermilps	$0x93, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 3 wraps to 0)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0xee, %ymm0, %ymm1, %ymm0 // lanes 0 and 4 from ymm1, rest from ymm0
	vmulps		%ymm15, %ymm0, %ymm0 // scale by alpha
	vmovaps		%ymm0, (%r14)

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 32(%r14)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 64(%r14)

	// column 3
	vmovaps		96(%r12), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, 96(%r14)

	// step all pointers past the 4 columns just handled
	addq		$128, %r12
	addq		$128, %rax
	addq		$128, %r14
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r12), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm15, %ymm0, %ymm0
	vmovaps		%ymm0, (%r14)

	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_7_lib8, .-inner_kernel_sgecpsc_8_7_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_7_LIB8
// subroutine
//
// Copy with a 7-row offset in the source. For each of the k columns:
//   B <- { A0[7], A1[0], ..., A1[6] }
// where A0 is the 8-float vector at r11 and A1 is the 8-float vector located
// r12 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (A0, read with aligned 32-byte loads)
// r12    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r13    <- B      (written with aligned 32-byte stores)
//
// on return: r11, rax and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_7_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_7_lib8, @function
inner_kernel_sgecp_8_7_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_7_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_7_lib8:
#endif
#endif

	leaq	(%r11,%r12), %rax // A1 <- A0 + 8*sda*sizeof(float)

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r11), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0 // lanes 0..6 from A1, lane 7 from A0
	vpermilps	$0x93, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 3 wraps to 0)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0xee, %ymm0, %ymm1, %ymm0 // lanes 0 and 4 from ymm1, rest from ymm0
	vmovaps		%ymm0, (%r13)

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 32(%r13)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 64(%r13)

	// column 3
	vmovaps		96(%r11), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, 96(%r13)

	// step all pointers past the 4 columns just handled
	addq		$128, %r11
	addq		$128, %rax
	addq		$128, %r13
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r11), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmovaps		%ymm0, (%r13)

	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
#endif
#endif

// end





// Variable number of rows

// src matrix aligned

// void INNER_KERNEL_SGECPSC_8_0_GEN_LIB8
//
// Scaled, row-masked copy: for each of the k columns, loads the 8-float
// vector at A, multiplies it by alpha and stores to B only the lanes
// selected by the row mask.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (read with aligned 32-byte loads)
// r13    <- B      (written with masked 32-byte stores)
// r14d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 }, so that rows
// 0..m1-1 are written -- confirm against its definition.
//
// on return: r12 and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   ymm0, ymm12, ymm14, ymm15, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_0_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_0_gen_lib8, @function
inner_kernel_sgecpsc_8_0_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_0_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_0_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_0_gen_lib8:
#endif
#endif

	// compute mask for rows
	vcvtsi2ss	%r14d, %xmm15, %xmm15 // (float) m1 in lane 0
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15 // mask <- LC00 - m1

	vbroadcastss	0(%r11), %ymm14 // alpha in every lane

	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop
	.p2align 3
1: // main loop

	vmovaps		0(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r13)
	subl		$4, %r10d

	vmovaps		32(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  32(%r13)

	vmovaps		64(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  64(%r13)

	addq		$128, %r12
	addq		$128, %r13

	vmovaps		-32(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  -32(%r13)

	// same threshold as the loop entry test: with 4 or more columns left
	// take another unrolled pass instead of falling into the clean-up loop
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop

	vmovaps		0(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r13)
	subl		$1, %r10d

	addq		$32, %r12
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_0_gen_lib8, .-inner_kernel_sgecpsc_8_0_gen_lib8
#endif

#endif
//
// end static subroutine





// void INNER_KERNEL_SGECPSC_8_0_GEN_U_LIB8
//
// Scaled, row-masked copy with unaligned source loads: for each of the k
// columns, loads the 8-float vector at A, multiplies it by alpha and stores
// to B only the lanes selected by the row mask.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (read with unaligned 32-byte loads)
// r13    <- B      (written with masked 32-byte stores)
// r14d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 }, so that rows
// 0..m1-1 are written -- confirm against its definition.
//
// on return: r12 and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   ymm0, ymm12, ymm14, ymm15, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_0_GEN_U_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_0_gen_u_lib8, @function
inner_kernel_sgecpsc_8_0_gen_u_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_0_gen_u_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_0_gen_u_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_0_gen_u_lib8:
#endif
#endif

	// compute mask for rows
	vcvtsi2ss	%r14d, %xmm15, %xmm15 // (float) m1 in lane 0
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15 // mask <- LC00 - m1

	vbroadcastss	0(%r11), %ymm14 // alpha in every lane

	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop
	.p2align 3
1: // main loop

	vmovups		0(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r13)
	subl		$4, %r10d

	vmovups		32(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  32(%r13)

	vmovups		64(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  64(%r13)

	addq		$128, %r12
	addq		$128, %r13

	vmovups		-32(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  -32(%r13)

	// same threshold as the loop entry test: with 4 or more columns left
	// take another unrolled pass instead of falling into the clean-up loop
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop

	vmovups		0(%r12), %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r13)
	subl		$1, %r10d

	addq		$32, %r12
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_0_gen_u_lib8, .-inner_kernel_sgecpsc_8_0_gen_u_lib8
#endif

#endif
//
// end static subroutine





// void INNER_KERNEL_SGECP_8_0_GEN_LIB8
//
// Row-masked copy: for each of the k columns, loads the 8-float vector at A
// and stores to B only the lanes selected by the row mask.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (read with aligned 32-byte loads)
// r12    <- B      (written with masked 32-byte stores)
// r13d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 }, so that rows
// 0..m1-1 are written -- confirm against its definition.
//
// on return: r11 and r12 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   ymm0, ymm12, ymm15, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_0_gen_lib8, @function
inner_kernel_sgecp_8_0_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_0_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_0_gen_lib8:
#endif
#endif

	// compute mask for rows
	vcvtsi2ss	%r13d, %xmm15, %xmm15 // (float) m1 in lane 0
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15 // mask <- LC00 - m1

	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop
	.p2align 3
1: // main loop

	vmovaps		0(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r12)
	subl		$4, %r10d

	vmovaps		32(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  32(%r12)

	vmovaps		64(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  64(%r12)

	addq		$128, %r12
	addq		$128, %r11

	vmovaps		-32(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  -32(%r12)

	// same threshold as the loop entry test: with 4 or more columns left
	// take another unrolled pass instead of falling into the clean-up loop
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop

	vmovaps		0(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r12)
	subl		$1, %r10d

	addq		$32, %r11
	addq		$32, %r12

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	// the size must be attached to the symbol defined by this block
	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
#endif

#endif
//
// end static subroutine





// void INNER_KERNEL_SGECP_8_0_GEN_U_LIB8
//
// Row-masked copy with unaligned source loads: for each of the k columns,
// loads the 8-float vector at A and stores to B only the lanes selected by
// the row mask.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (read with unaligned 32-byte loads)
// r12    <- B      (written with masked 32-byte stores)
// r13d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 }, so that rows
// 0..m1-1 are written -- confirm against its definition.
//
// on return: r11 and r12 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   ymm0, ymm12, ymm15, flags
//
#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_0_GEN_U_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_0_gen_u_lib8, @function
inner_kernel_sgecp_8_0_gen_u_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_0_gen_u_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_0_gen_u_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_0_gen_u_lib8:
#endif
#endif

	// compute mask for rows
	vcvtsi2ss	%r13d, %xmm15, %xmm15 // (float) m1 in lane 0
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15 // mask <- LC00 - m1

	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop
	.p2align 3
1: // main loop

	vmovups		0(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r12)
	subl		$4, %r10d

	vmovups		32(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  32(%r12)

	vmovups		64(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  64(%r12)

	addq		$128, %r12
	addq		$128, %r11

	vmovups		-32(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  -32(%r12)

	// same threshold as the loop entry test: with 4 or more columns left
	// take another unrolled pass instead of falling into the clean-up loop
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

3: // clean-up loop

	vmovups		0(%r11), %ymm0
	vmaskmovps	%ymm0, %ymm15,  0(%r12)
	subl		$1, %r10d

	addq		$32, %r11
	addq		$32, %r12

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	// the size must be attached to the symbol defined by this block
	.size	inner_kernel_sgecp_8_0_gen_u_lib8, .-inner_kernel_sgecp_8_0_gen_u_lib8
#endif

#endif
//
// end static subroutine


// src matrix not aligned

// void INNER_KERNEL_SGECPSC_8_1_GEN_LIB8
//
// Scaled, row-masked copy with a 1-row offset in the source. For each of the
// k columns:
//   B <- alpha * { A0[1], ..., A0[7], A1[0] }   (masked store)
// where A0 is the 8-float vector at r12 and A1 is the 8-float vector located
// r13 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (A0, read with aligned 32-byte loads)
// r13    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r14    <- B      (written with masked 32-byte stores)
// r15d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r12, rax and r14 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm14, ymm15, flags


#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_1_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_1_gen_lib8, @function
inner_kernel_sgecpsc_8_1_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_1_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_1_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_1_gen_lib8:
#endif
#endif

	leaq	(%r12,%r13), %rax // A1 <- A0 + 8*sda*sizeof(float)

	vbroadcastss	(%r11), %ymm14 // alpha in every lane

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r15d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r12), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0 // lane 0 from A1, lanes 1..7 from A0
	vpermilps	$0x39, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 0 wraps to 3)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x77, %ymm0, %ymm1, %ymm0 // lanes 3 and 7 from ymm1, rest from ymm0
	vmulps		%ymm14, %ymm0, %ymm0 // scale by alpha
	vmaskmovps	%ymm0, %ymm15, (%r14)

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)

	// column 3
	vmovaps		96(%r12), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r14)

	// step all pointers past the 4 columns just handled
	addq		$128, %r12
	addq		$128, %rax
	addq		$128, %r14
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r12), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r14)

	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_1_gen_lib8, .-inner_kernel_sgecpsc_8_1_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_1_GEN_LIB8
//
// Row-masked copy with a 1-row offset in the source. For each of the k
// columns:
//   B <- { A0[1], ..., A0[7], A1[0] }   (masked store)
// where A0 is the 8-float vector at r11 and A1 is the 8-float vector located
// r12 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (A0, read with aligned 32-byte loads)
// r12    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r13    <- B      (written with masked 32-byte stores)
// r14d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r11, rax and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_1_gen_lib8, @function
inner_kernel_sgecp_8_1_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_1_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_1_gen_lib8:
#endif
#endif

	leaq	(%r11,%r12), %rax // A1 <- A0 + 8*sda*sizeof(float)

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r14d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r11), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0 // lane 0 from A1, lanes 1..7 from A0
	vpermilps	$0x39, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 0 wraps to 3)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x77, %ymm0, %ymm1, %ymm0 // lanes 3 and 7 from ymm1, rest from ymm0
	vmaskmovps	%ymm0, %ymm15, (%r13)

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)

	// column 3
	vmovaps		96(%r11), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r13)

	// step all pointers past the 4 columns just handled
	addq		$128, %r11
	addq		$128, %rax
	addq		$128, %r13
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r11), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x01, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x77, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r13)

	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_2_GEN_LIB8
//
// Scaled, row-masked copy with a 2-row offset in the source. For each of the
// k columns:
//   B <- alpha * { A0[2], ..., A0[7], A1[0], A1[1] }   (masked store)
// where A0 is the 8-float vector at r12 and A1 is the 8-float vector located
// r13 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (A0, read with aligned 32-byte loads)
// r13    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r14    <- B      (written with masked 32-byte stores)
// r15d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r12, rax and r14 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_2_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_2_gen_lib8, @function
inner_kernel_sgecpsc_8_2_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_2_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_2_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_2_gen_lib8:
#endif
#endif

	leaq	(%r12,%r13), %rax // A1 <- A0 + 8*sda*sizeof(float)

	vbroadcastss	(%r11), %ymm14 // alpha in every lane

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r15d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r12), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0 // lanes 0,1 from A1, lanes 2..7 from A0
	vpermilps	$0x4e, %ymm0, %ymm0 // swap the two float pairs inside each 128-bit half
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x33, %ymm0, %ymm1, %ymm0 // lanes 0,1,4,5 from ymm0, rest from ymm1
	vmulps		%ymm14, %ymm0, %ymm0 // scale by alpha
	vmaskmovps	%ymm0, %ymm15, (%r14)

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)

	// column 3
	vmovaps		96(%r12), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r14)

	// step all pointers past the 4 columns just handled
	addq		$128, %r12
	addq		$128, %rax
	addq		$128, %r14
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r12), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r14)

	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_2_gen_lib8, .-inner_kernel_sgecpsc_8_2_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_2_GEN_LIB8
//
// Row-masked copy with a 2-row offset in the source. For each of the k
// columns:
//   B <- { A0[2], ..., A0[7], A1[0], A1[1] }   (masked store)
// where A0 is the 8-float vector at r11 and A1 is the 8-float vector located
// r12 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (A0, read with aligned 32-byte loads)
// r12    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r13    <- B      (written with masked 32-byte stores)
// r14d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r11, rax and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_2_gen_lib8, @function
inner_kernel_sgecp_8_2_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_2_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_2_gen_lib8:
#endif
#endif

	leaq	(%r11,%r12), %rax // A1 <- A0 + 8*sda*sizeof(float)

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r14d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r11), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0 // lanes 0,1 from A1, lanes 2..7 from A0
	vpermilps	$0x4e, %ymm0, %ymm0 // swap the two float pairs inside each 128-bit half
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x33, %ymm0, %ymm1, %ymm0 // lanes 0,1,4,5 from ymm0, rest from ymm1
	vmaskmovps	%ymm0, %ymm15, (%r13)

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)

	// column 3
	vmovaps		96(%r11), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r13)

	// step all pointers past the 4 columns just handled
	addq		$128, %r11
	addq		$128, %rax
	addq		$128, %r13
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r11), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x03, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x33, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r13)

	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_3_GEN_LIB8
//
// Scaled, row-masked copy with a 3-row offset in the source. For each of the
// k columns:
//   B <- alpha * { A0[3], ..., A0[7], A1[0], A1[1], A1[2] }   (masked store)
// where A0 is the 8-float vector at r12 and A1 is the 8-float vector located
// r13 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- alpha  (address of one float, broadcast to all 8 lanes)
// r12    <- A      (A0, read with aligned 32-byte loads)
// r13    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r14    <- B      (written with masked 32-byte stores)
// r15d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r12, rax and r14 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_3_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_3_gen_lib8, @function
inner_kernel_sgecpsc_8_3_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_3_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_3_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_3_gen_lib8:
#endif
#endif

	leaq	(%r12,%r13), %rax // A1 <- A0 + 8*sda*sizeof(float)

	vbroadcastss	(%r11), %ymm14 // alpha in every lane

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r15d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r12), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0 // lanes 0..2 from A1, lanes 3..7 from A0
	vpermilps	$0x93, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 3 wraps to 0)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x11, %ymm0, %ymm1, %ymm0 // lanes 0 and 4 from ymm0, rest from ymm1
	vmulps		%ymm14, %ymm0, %ymm0 // scale by alpha
	vmaskmovps	%ymm0, %ymm15, (%r14)

	// column 1
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)

	// column 2
	vmovaps		64(%r12), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)

	// column 3
	vmovaps		96(%r12), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r14)

	// step all pointers past the 4 columns just handled
	addq		$128, %r12
	addq		$128, %rax
	addq		$128, %r14
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r12), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r14)

	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_3_gen_lib8, .-inner_kernel_sgecpsc_8_3_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_3_GEN_LIB8
// subroutine
//
// Row-masked copy with a 3-row offset in the source. For each of the k
// columns:
//   B <- { A0[3], ..., A0[7], A1[0], A1[1], A1[2] }   (masked store)
// where A0 is the 8-float vector at r11 and A1 is the 8-float vector located
// r12 bytes after it.
//
// input arguments:
// r10d   <- k      (number of 8-float columns to process)
// r11    <- A      (A0, read with aligned 32-byte loads)
// r12    <- 8*sda*sizeof(float)  (byte distance A0 -> A1, added as a 64-bit value)
// r13    <- B      (written with masked 32-byte stores)
// r14d   <- m1     (row limit used to build the store mask)
//
// The mask is LC00 - m1 per lane; vmaskmovps writes the lanes whose sign bit
// is set, i.e. the lanes where the LC00 entry is below m1. LC00 is defined
// elsewhere in this file; presumably { 0.5, 1.5, ..., 7.5 } -- confirm.
//
// on return: r11, rax and r13 have advanced by 32 bytes per processed column,
//            r10d has been decremented once per processed column
// scratch:   rax, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_3_gen_lib8, @function
inner_kernel_sgecp_8_3_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_3_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_3_gen_lib8:
#endif
#endif

	leaq	(%r11,%r12), %rax // A1 <- A0 + 8*sda*sizeof(float)

	// row mask <- LC00 - m1
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vcvtsi2ss	%r14d, %xmm15, %xmm15 // (float) m1 in lane 0
	vshufps		$0x00, %xmm15, %xmm15, %xmm15 // m1 in lanes 0..3
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15 // m1 in lanes 0..7
	vsubps		%ymm15, %ymm12, %ymm15

	cmpl	$3, %r10d
	jle		0f // fewer than 4 columns: clean-up only

	.p2align 3
1: // unrolled loop, 4 columns per pass

	// column 0
	vmovaps		(%r11), %ymm0 // A0
	vmovaps		(%rax), %ymm1 // A1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0 // lanes 0..2 from A1, lanes 3..7 from A0
	vpermilps	$0x93, %ymm0, %ymm0 // rotate each 128-bit half by one float (element 3 wraps to 0)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1 // ymm1 <- ymm0 with its 128-bit halves exchanged
	vblendps	$0x11, %ymm0, %ymm1, %ymm0 // lanes 0 and 4 from ymm0, rest from ymm1
	vmaskmovps	%ymm0, %ymm15, (%r13)

	// column 1
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)

	// column 2
	vmovaps		64(%r11), %ymm0
	vmovaps		64(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)

	// column 3
	vmovaps		96(%r11), %ymm0
	vmovaps		96(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r13)

	// step all pointers past the 4 columns just handled
	addq		$128, %r11
	addq		$128, %rax
	addq		$128, %r13
	subl		$4, %r10d

	cmpl		$3, %r10d
	jg			1b // at least 4 columns left

0: // anything left for the clean-up loop?
	cmpl	$0, %r10d
	jle		2f // nothing left

3: // clean-up loop, 1 column per pass

	vmovaps		(%r11), %ymm0
	vmovaps		(%rax), %ymm1
	vblendps	$0x07, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x11, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, (%r13)

	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13
	subl		$1, %r10d

	cmpl		$0, %r10d
	jg			3b // more columns left

2: // done

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_4_GEN_LIB8
// subroutine
//
// Copies and scales k columns of 8 floats from A to B when the source starts
// at row offset 4 of an 8-row panel: every stored column is
// alpha * { A0[4..7], A1[0..3] }, with A0 the panel at r12 and A1 the panel
// r13 bytes after it. Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer; the scalar is read with vbroadcastss 0(%r11))
// r12    <- A (A0; 128-bit halves are loaded with vmovaps, so 16-byte alignment is required)
// r13    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r14    <- B
// r15d   <- m1
//
// clobbers: rax, r10, r12, r14, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_4_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_4_gen_lib8, @function
inner_kernel_sgecpsc_8_4_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_4_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_4_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_4_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r15d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm14 <- alpha in all 8 lanes
	vbroadcastss	0(%r11), %ymm14

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0: low half <- A0[4..7] (upper 16 bytes of the A0 column),
	// high half <- A1[0..3] (lower 16 bytes of the A1 column)
	vmovaps		16(%r12), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$4, %r10d

	// column 1
	vmovaps		48(%r12), %xmm0
	vmovaps		32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)
	// A0 is advanced here; the A0 loads of columns 2 and 3 use negative offsets
	addq		$128, %r12

	// column 2 (A1 not advanced yet, hence the positive offset)
	vmovaps		-48(%r12), %xmm0
	vmovaps		64(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)
	addq		$128, %rax

	// column 3
	vmovaps		-16(%r12), %xmm0
	vmovaps		-32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r14)
	addq		$128, %r14

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		16(%r12), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_4_gen_lib8, .-inner_kernel_sgecpsc_8_4_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_4_GEN_LIB8
// subroutine
//
// Copies k columns of 8 floats from A to B when the source starts at row
// offset 4 of an 8-row panel: every stored column is { A0[4..7], A1[0..3] },
// with A0 the panel at r11 and A1 the panel r12 bytes after it.
// Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- A (A0; 128-bit halves are loaded with vmovaps, so 16-byte alignment is required)
// r12    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r13    <- B
// r14d   <- m1
//
// clobbers: rax, r10, r11, r13, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_4_gen_lib8, @function
inner_kernel_sgecp_8_4_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_4_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_4_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0: low half <- A0[4..7] (upper 16 bytes of the A0 column),
	// high half <- A1[0..3] (lower 16 bytes of the A1 column)
	vmovaps		16(%r11), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$4, %r10d

	// column 1
	vmovaps		48(%r11), %xmm0
	vmovaps		32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)
	// A0 is advanced here; the A0 loads of columns 2 and 3 use negative offsets
	addq		$128, %r11

	// column 2 (A1 not advanced yet, hence the positive offset)
	vmovaps		-48(%r11), %xmm0
	vmovaps		64(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)
	addq		$128, %rax

	// column 3
	vmovaps		-16(%r11), %xmm0
	vmovaps		-32(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 96(%r13)
	addq		$128, %r13

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		16(%r11), %xmm0
	vmovaps		0(%rax), %xmm1
	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_5_GEN_LIB8
// subroutine
//
// Copies and scales k columns of 8 floats from A to B when the source starts
// at row offset 5 of an 8-row panel: every stored column is
// alpha * { A0[5..7], A1[0..4] }, with A0 the panel at r12 and A1 the panel
// r13 bytes after it. Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer; the scalar is read with vbroadcastss 0(%r11))
// r12    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r13    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r14    <- B
// r15d   <- m1
//
// clobbers: rax, r10, r12, r14, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_5_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_5_gen_lib8, @function
inner_kernel_sgecpsc_8_5_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_5_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_5_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_5_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r15d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm14 <- alpha in all 8 lanes
	vbroadcastss	0(%r11), %ymm14

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A0[5] A0[6] A0[7] }
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	// rotate inside each 128-bit lane:
	// ymm0 <- { A1[1] A1[2] A1[3] A1[0] | A0[5] A0[6] A0[7] A1[4] }
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 3 and 7 from ymm0, all others from ymm1:
	// ymm0 <- { A0[5] A0[6] A0[7] A1[0] | A1[1] A1[2] A1[3] A1[4] }
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r12
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r14

	// column 3
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r14)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_5_gen_lib8, .-inner_kernel_sgecpsc_8_5_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_5_GEN_LIB8
// subroutine
//
// Copies k columns of 8 floats from A to B when the source starts at row
// offset 5 of an 8-row panel: every stored column is { A0[5..7], A1[0..4] },
// with A0 the panel at r11 and A1 the panel r12 bytes after it.
// Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r12    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r13    <- B
// r14d   <- m1
//
// clobbers: rax, r10, r11, r13, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_5_gen_lib8, @function
inner_kernel_sgecp_8_5_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_5_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_5_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A0[5] A0[6] A0[7] }
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	// rotate inside each 128-bit lane:
	// ymm0 <- { A1[1] A1[2] A1[3] A1[0] | A0[5] A0[6] A0[7] A1[4] }
	vpermilps	$0x39, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 3 and 7 from ymm0, all others from ymm1:
	// ymm0 <- { A0[5] A0[6] A0[7] A1[0] | A1[1] A1[2] A1[3] A1[4] }
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r11
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r13

	// column 3
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r13)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x39, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0x88, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_6_GEN_LIB8
// subroutine
//
// Copies and scales k columns of 8 floats from A to B when the source starts
// at row offset 6 of an 8-row panel: every stored column is
// alpha * { A0[6..7], A1[0..5] }, with A0 the panel at r12 and A1 the panel
// r13 bytes after it. Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer; the scalar is read with vbroadcastss 0(%r11))
// r12    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r13    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r14    <- B
// r15d   <- m1
//
// clobbers: rax, r10, r12, r14, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_6_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_6_gen_lib8, @function
inner_kernel_sgecpsc_8_6_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_6_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_6_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_6_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r15d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm14 <- alpha in all 8 lanes
	vbroadcastss	0(%r11), %ymm14

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A1[5] A0[6] A0[7] }
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	// swap the 64-bit halves of each 128-bit lane:
	// ymm0 <- { A1[2] A1[3] A1[0] A1[1] | A0[6] A0[7] A1[4] A1[5] }
	vpermilps	$0x4e, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 2, 3, 6, 7 from ymm0, all others from ymm1:
	// ymm0 <- { A0[6] A0[7] A1[0] A1[1] | A1[2] A1[3] A1[4] A1[5] }
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r12
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r14

	// column 3
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r14)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_6_gen_lib8, .-inner_kernel_sgecpsc_8_6_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_6_GEN_LIB8
// subroutine
//
// Copies k columns of 8 floats from A to B when the source starts at row
// offset 6 of an 8-row panel: every stored column is { A0[6..7], A1[0..5] },
// with A0 the panel at r11 and A1 the panel r12 bytes after it.
// Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r12    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r13    <- B
// r14d   <- m1
//
// clobbers: rax, r10, r11, r13, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_6_gen_lib8, @function
inner_kernel_sgecp_8_6_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_6_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_6_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A1[5] A0[6] A0[7] }
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	// swap the 64-bit halves of each 128-bit lane:
	// ymm0 <- { A1[2] A1[3] A1[0] A1[1] | A0[6] A0[7] A1[4] A1[5] }
	vpermilps	$0x4e, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 2, 3, 6, 7 from ymm0, all others from ymm1:
	// ymm0 <- { A0[6] A0[7] A1[0] A1[1] | A1[2] A1[3] A1[4] A1[5] }
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r11
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r13

	// column 3
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r13)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x4e, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECPSC_8_7_GEN_LIB8
// subroutine
//
// Copies and scales k columns of 8 floats from A to B when the source starts
// at row offset 7 of an 8-row panel: every stored column is
// alpha * { A0[7], A1[0..6] }, with A0 the panel at r12 and A1 the panel
// r13 bytes after it. Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- alpha (pointer; the scalar is read with vbroadcastss 0(%r11))
// r12    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r13    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r14    <- B
// r15d   <- m1
//
// clobbers: rax, r10, r12, r14, ymm0, ymm1, ymm12, ymm14, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECPSC_8_7_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecpsc_8_7_gen_lib8, @function
inner_kernel_sgecpsc_8_7_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecpsc_8_7_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecpsc_8_7_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecpsc_8_7_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r15d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r12, %rax // A1 <- A0
	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// ymm14 <- alpha in all 8 lanes
	vbroadcastss	0(%r11), %ymm14

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A1[5] A1[6] A0[7] }
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	// rotate inside each 128-bit lane:
	// ymm0 <- { A1[3] A1[0] A1[1] A1[2] | A0[7] A1[4] A1[5] A1[6] }
	vpermilps	$0x93, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 0 and 4 from ymm1, all others from ymm0:
	// ymm0 <- { A0[7] A1[0] A1[1] A1[2] | A1[3] A1[4] A1[5] A1[6] }
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r12), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r14)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r12
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r12), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r14)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r14

	// column 3
	vmovaps		-32(%r12), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r14)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r12), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmulps		%ymm14, %ymm0, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r14)
	subl		$1, %r10d
	addq		$32, %r12
	addq		$32, %rax
	addq		$32, %r14

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecpsc_8_7_gen_lib8, .-inner_kernel_sgecpsc_8_7_gen_lib8
#endif
#endif

// end





// void INNER_KERNEL_SGECP_8_7_GEN_LIB8
// subroutine
//
// Copies k columns of 8 floats from A to B when the source starts at row
// offset 7 of an 8-row panel: every stored column is { A0[7], A1[0..6] },
// with A0 the panel at r11 and A1 the panel r12 bytes after it.
// Each store is masked with a row mask derived from m1.
//
// input arguments:
// r10d   <- k
// r11    <- A (A0; loaded with vmovaps, so A0 and A1 must be 32-byte aligned)
// r12    <- 8*sda*sizeof(float) (byte offset from A0 to A1, added as a 64-bit value)
// r13    <- B
// r14d   <- m1
//
// clobbers: rax, r10, r11, r13, ymm0, ymm1, ymm12, ymm15, flags

#if MACRO_LEVEL>=1
	.macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_sgecp_8_7_gen_lib8, @function
inner_kernel_sgecp_8_7_gen_lib8:
#elif defined(OS_MAC)
_inner_kernel_sgecp_8_7_gen_lib8:
#elif defined(OS_WINDOWS)
	.def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
inner_kernel_sgecp_8_7_gen_lib8:
#endif
#endif

	// compute mask for rows
	// ymm15 <- LC00 - (float) m1, with m1 broadcast to all 8 lanes; vmaskmovps
	// stores only the lanes whose mask element has the sign bit set
	// NOTE(review): LC00 is defined outside this section; presumably it holds
	// { 0.5, 1.5, ..., 7.5 } so that exactly the first m1 lanes go negative - confirm
	vcvtsi2ss	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm12, %ymm15

	movq	%r11, %rax // A1 <- A0
	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)

	// fewer than 4 columns: go straight to the one-column clean-up loop
	cmpl	$3, %r10d
	jle		0f // consider clean-up

	// main loop: 4 columns (4 x 32 bytes) per iteration
	.p2align 3
1: // main loop

	// column 0
	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	// ymm0 <- { A1[0] A1[1] A1[2] A1[3] | A1[4] A1[5] A1[6] A0[7] }
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	// rotate inside each 128-bit lane:
	// ymm0 <- { A1[3] A1[0] A1[1] A1[2] | A0[7] A1[4] A1[5] A1[6] }
	vpermilps	$0x93, %ymm0, %ymm0
	// ymm1 <- ymm0 with its two 128-bit lanes swapped
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	// elements 0 and 4 from ymm1, all others from ymm0:
	// ymm0 <- { A0[7] A1[0] A1[1] A1[2] | A1[3] A1[4] A1[5] A1[6] }
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$4, %r10d

	// column 1 (same shuffle sequence)
	vmovaps		32(%r11), %ymm0
	vmovaps		32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 32(%r13)
	// source pointers are advanced here; columns 2 and 3 use negative offsets
	addq		$128, %r11
	addq		$128, %rax

	// column 2
	vmovaps		-64(%r11), %ymm0
	vmovaps		-64(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 64(%r13)
	// destination pointer is advanced here; column 3 is stored at -32
	addq		$128, %r13

	// column 3
	vmovaps		-32(%r11), %ymm0
	vmovaps		-32(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, -32(%r13)

	// loop while at least 4 columns remain
	cmpl		$3, %r10d
	jg			1b // main loop

0: // consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: the 1 to 3 remaining columns, one per iteration
3: // clean-up loop

	vmovaps		0(%r11), %ymm0
	vmovaps		0(%rax), %ymm1
	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
	vpermilps	$0x93, %ymm0, %ymm0
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
	vblendps	$0xee, %ymm0, %ymm1, %ymm0
	vmaskmovps	%ymm0, %ymm15, 0(%r13)
	subl		$1, %r10d
	addq		$32, %r11
	addq		$32, %rax
	addq		$32, %r13

	cmpl		$0, %r10d
	jg			3b // clean-up loop

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
#endif
#endif

// end





// --- Public functions

// All 8 rows

// src matrix aligned

// void kernel_sgecpsc_8_0_lib8(int k, float *alpha, float *A, float *B);
//                              1      2             3         4
//
// Public entry point for the aligned-source (row offset 0) copy-and-scale
// kernel. It only marshals the four C arguments into r10..r13 and hands
// over to inner_kernel_sgecpsc_8_0_lib8, which is defined elsewhere in
// this file (presumably B <- alpha * A over k columns - confirm there).
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_0_lib8
	.type kernel_sgecpsc_8_0_lib8, @function
kernel_sgecpsc_8_0_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_0_lib8
_kernel_sgecpsc_8_0_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_0_lib8
	.def kernel_sgecpsc_8_0_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_0_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG4, %r13 // B
	movq	ARG3, %r12 // A
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG1, %r10 // k

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_0_lib8, .-kernel_sgecpsc_8_0_lib8
#endif
//
// end public fun





// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
//                            1      2         3
//
// Public entry point for the aligned-source (row offset 0) copy kernel.
// It only marshals the three C arguments into r10..r12 and hands over to
// inner_kernel_sgecp_8_0_lib8, which is defined elsewhere in this file.
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_0_lib8
	.type kernel_sgecp_8_0_lib8, @function
kernel_sgecp_8_0_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_0_lib8
_kernel_sgecp_8_0_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_0_lib8
	.def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_0_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG3, %r12 // B
	movq	ARG2, %r11 // A
	movq	ARG1, %r10 // k

	// inner copy kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_0_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_0_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_0_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
#endif
//
// end





// void kernel_sgesc_8_0_lib8(int k, float *alpha, float *A);
//                            1      2             3
//
// Public entry point for the in-place scale kernel. It reuses the
// copy-and-scale inner kernel inner_kernel_sgecpsc_8_0_lib8 (defined
// elsewhere in this file) with A passed as both source (r12) and
// destination (r13).
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgesc_8_0_lib8
	.type kernel_sgesc_8_0_lib8, @function
kernel_sgesc_8_0_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgesc_8_0_lib8
_kernel_sgesc_8_0_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgesc_8_0_lib8
	.def kernel_sgesc_8_0_lib8; .scl 2; .type 32; .endef
kernel_sgesc_8_0_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG3, %r12 // A (source)
	movq	%r12, %r13 // destination <- A (in place)
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG1, %r10 // k

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgesc_8_0_lib8, .-kernel_sgesc_8_0_lib8
#endif
//
// end public fun




// src matrix not aligned

// void kernel_sgecpsc_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Public entry point for the copy-and-scale kernel with the source starting
// at row offset 1 of an 8-row panel. It marshals the C arguments into
// r10..r14, converts sda into a byte stride, and hands over to
// inner_kernel_sgecpsc_8_1_lib8 (defined elsewhere in this file).
// alpha is taken from an integer argument slot, i.e. it is passed by address,
// as in kernel_sgecpsc_8_0_lib8.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_1_lib8
	.type kernel_sgecpsc_8_1_lib8, @function
kernel_sgecpsc_8_1_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_1_lib8
_kernel_sgecpsc_8_1_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_1_lib8
	.def kernel_sgecpsc_8_1_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_1_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // 8*sda*sizeof(float); the 32-bit shift also clears the upper half of r13
	movq	ARG5, %r14 // B

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_1_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_1_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_1_lib8
#endif
#endif

	EPILOGUE

	ret

	// symbol size: distance from this function's own label to here
#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_1_lib8, .-kernel_sgecpsc_8_1_lib8
#endif

// end





// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// Public entry point for the copy kernel with the source starting at row
// offset 1 of an 8-row panel. It marshals the C arguments into r10..r13,
// converts sda into a byte stride, and hands over to
// inner_kernel_sgecp_8_1_lib8 (defined elsewhere in this file).

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_1_lib8
	.type kernel_sgecp_8_1_lib8, @function
kernel_sgecp_8_1_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_1_lib8
_kernel_sgecp_8_1_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_1_lib8
	.def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_1_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG3, %r12 // sda
	shll	$5, %r12d  // 8*sda*sizeof(float)
	movq	ARG4, %r13 // B
	movq	ARG2, %r11 // A
	movq	ARG1, %r10 // k

	// inner copy kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_1_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_1_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_1_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
#endif

// end





// void kernel_sgecpsc_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Public entry point for the copy-and-scale kernel with the source starting
// at row offset 2 of an 8-row panel. It marshals the C arguments into
// r10..r14, converts sda into a byte stride, and hands over to
// inner_kernel_sgecpsc_8_2_lib8 (defined elsewhere in this file).
// alpha is taken from an integer argument slot, i.e. it is passed by address,
// as in kernel_sgecpsc_8_0_lib8.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_2_lib8
	.type kernel_sgecpsc_8_2_lib8, @function
kernel_sgecpsc_8_2_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_2_lib8
_kernel_sgecpsc_8_2_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_2_lib8
	.def kernel_sgecpsc_8_2_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_2_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // 8*sda*sizeof(float); the 32-bit shift also clears the upper half of r13

	movq	ARG5, %r14 // B

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_2_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_2_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_2_lib8
#endif
#endif

	EPILOGUE

	ret

	// symbol size: distance from this function's own label to here
#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_2_lib8, .-kernel_sgecpsc_8_2_lib8
#endif

// end




// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// Public entry point for the copy kernel with the source starting at row
// offset 2 of an 8-row panel. It marshals the C arguments into r10..r13,
// converts sda into a byte stride, and hands over to
// inner_kernel_sgecp_8_2_lib8 (defined elsewhere in this file).

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_2_lib8
	.type kernel_sgecp_8_2_lib8, @function
kernel_sgecp_8_2_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_2_lib8
_kernel_sgecp_8_2_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_2_lib8
	.def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_2_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG3, %r12 // sda
	shll	$5, %r12d  // 8*sda*sizeof(float)
	movq	ARG4, %r13 // B
	movq	ARG2, %r11 // A
	movq	ARG1, %r10 // k

	// inner copy kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_2_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_2_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_2_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
#endif

// end





// void kernel_sgecpsc_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Public entry point for the copy-and-scale kernel with the source starting
// at row offset 3 of an 8-row panel. It marshals the C arguments into
// r10..r14, converts sda into a byte stride, and hands over to
// inner_kernel_sgecpsc_8_3_lib8 (defined elsewhere in this file).
// alpha is taken from an integer argument slot, i.e. it is passed by address,
// as in kernel_sgecpsc_8_0_lib8.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_3_lib8
	.type kernel_sgecpsc_8_3_lib8, @function
kernel_sgecpsc_8_3_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_3_lib8
_kernel_sgecpsc_8_3_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_3_lib8
	.def kernel_sgecpsc_8_3_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_3_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // 8*sda*sizeof(float); the 32-bit shift also clears the upper half of r13
	movq	ARG5, %r14 // B

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_3_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_3_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_3_lib8
#endif
#endif

	EPILOGUE

	ret

	// symbol size: distance from this function's own label to here
#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_3_lib8, .-kernel_sgecpsc_8_3_lib8
#endif

// end





// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// Public entry point for the copy kernel with the source starting at row
// offset 3 of an 8-row panel. It marshals the C arguments into r10..r13,
// converts sda into a byte stride, and hands over to
// inner_kernel_sgecp_8_3_lib8 (defined elsewhere in this file).

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_3_lib8
	.type kernel_sgecp_8_3_lib8, @function
kernel_sgecp_8_3_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_3_lib8
_kernel_sgecp_8_3_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_3_lib8
	.def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_3_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG3, %r12 // sda
	shll	$5, %r12d  // 8*sda*sizeof(float)
	movq	ARG4, %r13 // B
	movq	ARG2, %r11 // A
	movq	ARG1, %r10 // k

	// inner copy kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_3_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_3_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_3_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
#endif

// end





// void kernel_sgecpsc_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Public entry point for the copy-and-scale kernel with the source starting
// at row offset 4 of an 8-row panel. It marshals the C arguments into
// r10..r14, converts sda into a byte stride, and hands over to
// inner_kernel_sgecpsc_8_4_lib8 (defined elsewhere in this file).
// alpha is taken from an integer argument slot, i.e. it is passed by address,
// as in kernel_sgecpsc_8_0_lib8.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_4_lib8
	.type kernel_sgecpsc_8_4_lib8, @function
kernel_sgecpsc_8_4_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_4_lib8
_kernel_sgecpsc_8_4_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_4_lib8
	.def kernel_sgecpsc_8_4_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_4_lib8:
#endif

	PROLOGUE

	// marshal the C arguments into the registers read by the inner kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha (pointer)
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // 8*sda*sizeof(float); the 32-bit shift also clears the upper half of r13
	movq	ARG5, %r14 // B

	// inner copy-and-scale kernel, inlined as a macro or called as a subroutine

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_4_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_4_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_4_lib8
#endif
#endif

	EPILOGUE

	ret

	// symbol size: distance from this function's own label to here
#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_4_lib8, .-kernel_sgecpsc_8_4_lib8
#endif

// end





// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// C-callable wrapper around inner_kernel_sgecp_8_4_lib8. Between PROLOGUE and
// EPILOGUE it only places the arguments in the registers listed below and
// runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_4_lib8
	.type kernel_sgecp_8_4_lib8, @function
kernel_sgecp_8_4_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_4_lib8
_kernel_sgecp_8_4_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_4_lib8
	.def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_4_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_4_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_4_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_4_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
#endif
//
// end





// void kernel_sgecpsc_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Exported wrapper: moves the five arguments into r10-r14 and runs
// inner_kernel_sgecpsc_8_5_lib8 (inlined as a macro when MACRO_LEVEL>=1,
// called otherwise).
// NOTE(review): alpha is read from the second integer argument slot (ARG2)
// and A from the third, which matches a pointer argument as in the
// *_gen_lib8 prototypes -- confirm against the inner kernel.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_5_lib8
	.type kernel_sgecpsc_8_5_lib8, @function
kernel_sgecpsc_8_5_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_5_lib8
_kernel_sgecpsc_8_5_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_5_lib8
	.def kernel_sgecpsc_8_5_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_5_lib8:
#endif

	PROLOGUE

	// set up the register arguments of the inner copy-and-scale kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // sda *= 32, i.e. 8*sda*sizeof(float)
	movq	ARG5, %r14 // B

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_5_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_5_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_5_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	// the symbol size is measured from this function's own entry label
	.size	kernel_sgecpsc_8_5_lib8, .-kernel_sgecpsc_8_5_lib8
#endif

// end





// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// C-callable wrapper around inner_kernel_sgecp_8_5_lib8. Between PROLOGUE and
// EPILOGUE it only places the arguments in the registers listed below and
// runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_5_lib8
	.type kernel_sgecp_8_5_lib8, @function
kernel_sgecp_8_5_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_5_lib8
_kernel_sgecp_8_5_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_5_lib8
	.def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_5_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_5_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_5_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_5_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
#endif
//
// end





// void kernel_sgecpsc_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Exported wrapper: moves the five arguments into r10-r14 and runs
// inner_kernel_sgecpsc_8_6_lib8 (inlined as a macro when MACRO_LEVEL>=1,
// called otherwise).
// NOTE(review): alpha is read from the second integer argument slot (ARG2)
// and A from the third, which matches a pointer argument as in the
// *_gen_lib8 prototypes -- confirm against the inner kernel.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_6_lib8
	.type kernel_sgecpsc_8_6_lib8, @function
kernel_sgecpsc_8_6_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_6_lib8
_kernel_sgecpsc_8_6_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_6_lib8
	.def kernel_sgecpsc_8_6_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_6_lib8:
#endif

	PROLOGUE

	// set up the register arguments of the inner copy-and-scale kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // sda *= 32, i.e. 8*sda*sizeof(float)
	movq	ARG5, %r14 // B

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_6_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_6_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_6_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	// the symbol size is measured from this function's own entry label
	.size	kernel_sgecpsc_8_6_lib8, .-kernel_sgecpsc_8_6_lib8
#endif

// end





// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// C-callable wrapper around inner_kernel_sgecp_8_6_lib8. Between PROLOGUE and
// EPILOGUE it only places the arguments in the registers listed below and
// runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_6_lib8
	.type kernel_sgecp_8_6_lib8, @function
kernel_sgecp_8_6_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_6_lib8
_kernel_sgecp_8_6_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_6_lib8
	.def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_6_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_6_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_6_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_6_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
#endif
//
// end





// void kernel_sgecpsc_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
//                              1      2             3         4        5
//
// Exported wrapper: moves the five arguments into r10-r14 and runs
// inner_kernel_sgecpsc_8_7_lib8 (inlined as a macro when MACRO_LEVEL>=1,
// called otherwise).
// NOTE(review): alpha is read from the second integer argument slot (ARG2)
// and A from the third, which matches a pointer argument as in the
// *_gen_lib8 prototypes -- confirm against the inner kernel.

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_7_lib8
	.type kernel_sgecpsc_8_7_lib8, @function
kernel_sgecpsc_8_7_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_7_lib8
_kernel_sgecpsc_8_7_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_7_lib8
	.def kernel_sgecpsc_8_7_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_7_lib8:
#endif

	PROLOGUE

	// set up the register arguments of the inner copy-and-scale kernel

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // alpha
	movq	ARG3, %r12 // A

	movq	ARG4, %r13 // sda
	sall	$5, %r13d  // sda *= 32, i.e. 8*sda*sizeof(float)
	movq	ARG5, %r14 // B

#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_7_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_7_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_7_lib8
#endif
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	// the symbol size is measured from this function's own entry label
	.size	kernel_sgecpsc_8_7_lib8, .-kernel_sgecpsc_8_7_lib8
#endif

// end





// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
//                            1      2         3        4
//
// C-callable wrapper around inner_kernel_sgecp_8_7_lib8. Between PROLOGUE and
// EPILOGUE it only places the arguments in the registers listed below and
// runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_7_lib8
	.type kernel_sgecp_8_7_lib8, @function
kernel_sgecp_8_7_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_7_lib8
_kernel_sgecp_8_7_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_7_lib8
	.def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_7_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_7_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_7_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_7_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
#endif
//
// end

// Variable number of rows

// src matrix aligned





// void kernel_sgecpsc_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
//                                  1      2             3         4         5
//
// Public entry point for inner_kernel_sgecpsc_8_0_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = B
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_0_gen_lib8
	.type kernel_sgecpsc_8_0_gen_lib8, @function
kernel_sgecpsc_8_0_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_0_gen_lib8
_kernel_sgecpsc_8_0_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_0_gen_lib8
	.def kernel_sgecpsc_8_0_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_0_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_0_gen_lib8, .-kernel_sgecpsc_8_0_gen_lib8
#endif
//
// end





// void kernel_sgecpsc_8_0_gen_u_lib8(int k, float *alpha, float *A, float *B, int m1);
//                                    1      2             3         4         5
//
// Public entry point for inner_kernel_sgecpsc_8_0_gen_u_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = B
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_0_gen_u_lib8
	.type kernel_sgecpsc_8_0_gen_u_lib8, @function
kernel_sgecpsc_8_0_gen_u_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_0_gen_u_lib8
_kernel_sgecpsc_8_0_gen_u_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_0_gen_u_lib8
	.def kernel_sgecpsc_8_0_gen_u_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_0_gen_u_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_GEN_U_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_gen_u_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_gen_u_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_0_gen_u_lib8, .-kernel_sgecpsc_8_0_gen_u_lib8
#endif
//
// end





// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
//                                1      2         3         4
//
// Public entry point for inner_kernel_sgecp_8_0_gen_lib8: loads the four
// arguments into r10-r13 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = B
//   r13 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_0_gen_lib8
	.type kernel_sgecp_8_0_gen_lib8, @function
kernel_sgecp_8_0_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_0_gen_lib8
_kernel_sgecp_8_0_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_0_gen_lib8
	.def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_0_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// B
	movq	ARG4, %r13		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_0_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_0_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_0_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_0_gen_u_lib8(int k, float *A, float *B, int m1);
//                                  1      2         3         4
//
// Public entry point for inner_kernel_sgecp_8_0_gen_u_lib8: loads the four
// arguments into r10-r13 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = B
//   r13 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_0_gen_u_lib8
	.type kernel_sgecp_8_0_gen_u_lib8, @function
kernel_sgecp_8_0_gen_u_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_0_gen_u_lib8
_kernel_sgecp_8_0_gen_u_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_0_gen_u_lib8
	.def kernel_sgecp_8_0_gen_u_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_0_gen_u_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// B
	movq	ARG4, %r13		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_0_GEN_U_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_0_gen_u_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_0_gen_u_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_0_gen_u_lib8, .-kernel_sgecp_8_0_gen_u_lib8
#endif
//
// end





// void kernel_sgesc_8_0_gen_lib8(int k, float *alpha, float *A, int m1);
//                                1      2             3         4
//
// Public entry point that reuses inner_kernel_sgecpsc_8_0_gen_lib8. The same
// pointer A is loaded into both r12 and r13, the registers that
// kernel_sgecpsc_8_0_gen_lib8 fills with A and B respectively.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = A
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgesc_8_0_gen_lib8
	.type kernel_sgesc_8_0_gen_lib8, @function
kernel_sgesc_8_0_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgesc_8_0_gen_lib8
_kernel_sgesc_8_0_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgesc_8_0_gen_lib8
	.def kernel_sgesc_8_0_gen_lib8; .scl 2; .type 32; .endef
kernel_sgesc_8_0_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments; A fills both pointer registers
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG3, %r13		// A again
	movq	ARG4, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgesc_8_0_gen_lib8, .-kernel_sgesc_8_0_gen_lib8
#endif
//
// end





// void kernel_sgesc_8_0_gen_u_lib8(int k, float *alpha, float *A, int m1);
//                                  1      2             3         4
//
// Public entry point that reuses inner_kernel_sgecpsc_8_0_gen_u_lib8. The
// same pointer A is loaded into both r12 and r13, the registers that
// kernel_sgecpsc_8_0_gen_u_lib8 fills with A and B respectively.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = A
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgesc_8_0_gen_u_lib8
	.type kernel_sgesc_8_0_gen_u_lib8, @function
kernel_sgesc_8_0_gen_u_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgesc_8_0_gen_u_lib8
_kernel_sgesc_8_0_gen_u_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgesc_8_0_gen_u_lib8
	.def kernel_sgesc_8_0_gen_u_lib8; .scl 2; .type 32; .endef
kernel_sgesc_8_0_gen_u_lib8:
#endif

	PROLOGUE

	// marshal the arguments; A fills both pointer registers
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG3, %r13		// A again
	movq	ARG4, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_0_GEN_U_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_0_gen_u_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_0_gen_u_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgesc_8_0_gen_u_lib8, .-kernel_sgesc_8_0_gen_u_lib8
#endif
//
// end

// src matrix not aligned




// void kernel_sgecpsc_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_1_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_1_gen_lib8
	.type kernel_sgecpsc_8_1_gen_lib8, @function
kernel_sgecpsc_8_1_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_1_gen_lib8
_kernel_sgecpsc_8_1_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_1_gen_lib8
	.def kernel_sgecpsc_8_1_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_1_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_1_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_1_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_1_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_1_gen_lib8, .-kernel_sgecpsc_8_1_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_1_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_1_gen_lib8
	.type kernel_sgecp_8_1_gen_lib8, @function
kernel_sgecp_8_1_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_1_gen_lib8
_kernel_sgecp_8_1_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_1_gen_lib8
	.def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_1_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_1_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_1_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_1_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
#endif

// end





// void kernel_sgecpsc_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_2_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_2_gen_lib8
	.type kernel_sgecpsc_8_2_gen_lib8, @function
kernel_sgecpsc_8_2_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_2_gen_lib8
_kernel_sgecpsc_8_2_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_2_gen_lib8
	.def kernel_sgecpsc_8_2_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_2_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_2_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_2_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_2_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_2_gen_lib8, .-kernel_sgecpsc_8_2_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_2_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_2_gen_lib8
	.type kernel_sgecp_8_2_gen_lib8, @function
kernel_sgecp_8_2_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_2_gen_lib8
_kernel_sgecp_8_2_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_2_gen_lib8
	.def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_2_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_2_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_2_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_2_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
#endif

// end





// void kernel_sgecpsc_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_3_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_3_gen_lib8
	.type kernel_sgecpsc_8_3_gen_lib8, @function
kernel_sgecpsc_8_3_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_3_gen_lib8
_kernel_sgecpsc_8_3_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_3_gen_lib8
	.def kernel_sgecpsc_8_3_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_3_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_3_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_3_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_3_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_3_gen_lib8, .-kernel_sgecpsc_8_3_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_3_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_3_gen_lib8
	.type kernel_sgecp_8_3_gen_lib8, @function
kernel_sgecp_8_3_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_3_gen_lib8
_kernel_sgecp_8_3_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_3_gen_lib8
	.def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_3_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_3_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_3_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_3_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
#endif

// end





// void kernel_sgecpsc_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_4_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_4_gen_lib8
	.type kernel_sgecpsc_8_4_gen_lib8, @function
kernel_sgecpsc_8_4_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_4_gen_lib8
_kernel_sgecpsc_8_4_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_4_gen_lib8
	.def kernel_sgecpsc_8_4_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_4_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_4_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_4_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_4_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_4_gen_lib8, .-kernel_sgecpsc_8_4_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_4_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_4_gen_lib8
	.type kernel_sgecp_8_4_gen_lib8, @function
kernel_sgecp_8_4_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_4_gen_lib8
_kernel_sgecp_8_4_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_4_gen_lib8
	.def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_4_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_4_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_4_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_4_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
#endif
//
// end





// void kernel_sgecpsc_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_5_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_5_gen_lib8
	.type kernel_sgecpsc_8_5_gen_lib8, @function
kernel_sgecpsc_8_5_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_5_gen_lib8
_kernel_sgecpsc_8_5_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_5_gen_lib8
	.def kernel_sgecpsc_8_5_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_5_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_5_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_5_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_5_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_5_gen_lib8, .-kernel_sgecpsc_8_5_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_5_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_5_gen_lib8
	.type kernel_sgecp_8_5_gen_lib8, @function
kernel_sgecp_8_5_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_5_gen_lib8
_kernel_sgecp_8_5_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_5_gen_lib8
	.def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_5_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_5_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_5_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_5_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
#endif
//
// end





// void kernel_sgecpsc_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_6_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_6_gen_lib8
	.type kernel_sgecpsc_8_6_gen_lib8, @function
kernel_sgecpsc_8_6_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_6_gen_lib8
_kernel_sgecpsc_8_6_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_6_gen_lib8
	.def kernel_sgecpsc_8_6_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_6_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_6_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_6_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_6_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_6_gen_lib8, .-kernel_sgecpsc_8_6_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_6_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_6_gen_lib8
	.type kernel_sgecp_8_6_gen_lib8, @function
kernel_sgecp_8_6_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_6_gen_lib8
_kernel_sgecp_8_6_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_6_gen_lib8
	.def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_6_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_6_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_6_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_6_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
#endif
//
// end





// void kernel_sgecpsc_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
//                                  1      2             3         4        5         6
//
// Public entry point for inner_kernel_sgecpsc_8_7_gen_lib8: loads the six
// arguments into r10-r15 and runs the inner kernel.
//   r10 = k
//   r11 = alpha
//   r12 = A
//   r13 = sda << 5   (sda*8*sizeof(float))
//   r14 = B
//   r15 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecpsc_8_7_gen_lib8
	.type kernel_sgecpsc_8_7_gen_lib8, @function
kernel_sgecpsc_8_7_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecpsc_8_7_gen_lib8
_kernel_sgecpsc_8_7_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecpsc_8_7_gen_lib8
	.def kernel_sgecpsc_8_7_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecpsc_8_7_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy-and-scale kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// alpha
	movq	ARG3, %r12		// A
	movq	ARG4, %r13		// sda
	shll	$5, %r13d		// r13 = sda*8*sizeof(float)
	movq	ARG5, %r14		// B
	movq	ARG6, %r15		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECPSC_8_7_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecpsc_8_7_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecpsc_8_7_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecpsc_8_7_gen_lib8, .-kernel_sgecpsc_8_7_gen_lib8
#endif
//
// end





// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
//                                1      2         3        4         5
//
// Public entry point for inner_kernel_sgecp_8_7_gen_lib8: loads the five
// arguments into r10-r14 and runs the inner kernel.
//   r10 = k
//   r11 = A
//   r12 = sda << 5   (8*sda*sizeof(float))
//   r13 = B
//   r14 = m1
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_sgecp_8_7_gen_lib8
	.type kernel_sgecp_8_7_gen_lib8, @function
kernel_sgecp_8_7_gen_lib8:
#elif defined(OS_MAC)
	.globl _kernel_sgecp_8_7_gen_lib8
_kernel_sgecp_8_7_gen_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_sgecp_8_7_gen_lib8
	.def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
kernel_sgecp_8_7_gen_lib8:
#endif

	PROLOGUE

	// marshal the arguments for the inner copy kernel
	movq	ARG1, %r10		// k
	movq	ARG2, %r11		// A
	movq	ARG3, %r12		// sda
	shll	$5, %r12d		// r12 = 8*sda*sizeof(float)
	movq	ARG4, %r13		// B
	movq	ARG5, %r14		// m1

	// inner kernel: expanded in place as a macro, or reached through a call
#if MACRO_LEVEL>=1
	INNER_KERNEL_SGECP_8_7_GEN_LIB8
#elif defined(OS_LINUX) || defined(OS_WINDOWS)
	call inner_kernel_sgecp_8_7_gen_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_sgecp_8_7_gen_lib8
#endif

	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
#endif
//
// end





// read-only data

// Select the constant-data section for the target object format. On Linux the
// section is declared mergeable ("M") with 32-byte entries, which matches the
// eight 4-byte values emitted for each table below.
#if defined(OS_LINUX)
	.section	.rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section	__TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif

// Each table is eight single-precision values written as raw IEEE-754 bit
// patterns and aligned to 32 bytes. The { ... } comment on each label lists
// the values from the last stored element down to the first, so the first
// .long is the rightmost number in the comment.

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#endif
	.long	1056964608	// 0.5
	.long	1069547520	// 1.5
	.long	1075838976	// 2.5
	.long	1080033280	// 3.5
	.long	1083179008	// 4.5
	.long	1085276160	// 5.5
	.long	1087373312	// 6.5
	.long	1089470464	// 7.5

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
#elif defined(OS_MAC)
	.align 5
LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
#endif
	.long	1091043328	// 8.5
	.long	1092091904	// 9.5
	.long	1093140480	// 10.5
	.long	1094189056	// 11.5
	.long	1095237632	// 12.5
	.long	1096286208	// 13.5
	.long	1097334784	// 14.5
	.long	1098383360	// 15.5

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
#elif defined(OS_MAC)
	.align 5
LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
#endif
	.long	1099169792	// 16.5
	.long	1099694080	// 17.5
	.long	1100218368	// 18.5
	.long	1100742656	// 19.5
	.long	1101266944	// 20.5
	.long	1101791232	// 21.5
	.long	1102315520	// 22.5
	.long	1102839808	// 23.5

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
#endif
	.long	1065353216	// 1.0
	.long	1065353216	// 1.0
	.long	1065353216	// 1.0
	.long	1065353216	// 1.0
	.long	1065353216	// 1.0
	.long	1065353216	// 1.0
	.long	3212836864	// -1.0
	.long	3212836864	// -1.0



// Object-file trailer: an empty .note.GNU-stack section on Linux (the usual
// marker that the object does not need an executable stack), and
// .subsections_via_symbols on Mac.
#if defined(OS_LINUX)
	.section	.note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
//EOF
